import pickle
import re
import zhconv
import jieba
import jieba.posseg as psg
import joblib
from sklearn.feature_extraction.text import CountVectorizer

def clean_data(content):
    """Reduce raw text to a list of Chinese words with allowed POS tags.

    Steps: drop every character outside the ``\\u4e00-\\u9fa5`` range,
    convert the remainder with ``zhconv`` using the ``'zh-cn'`` locale
    (traditional -> simplified), then segment with ``jieba.posseg`` and keep
    only words whose part-of-speech tag is in the allow-list.

    Args:
        content: Raw text as a string.

    Returns:
        A list of the kept words, in the order the tokenizer produced them.
    """
    # Everything that is not in the CJK range below is removed, including
    # whitespace, digits, Latin letters and punctuation.
    chinese_only = re.sub(r'[^\u4e00-\u9fa5]', '', content)
    simplified = zhconv.convert(chinese_only, 'zh-cn')

    # POS tags that are allowed through the filter.
    kept_tags = ['n', 'nr', 'ns', 'nt', 'v', 'a']

    # psg.cut yields (word, tag) pairs: the segmented word and its POS tag.
    return [token for token, tag in psg.cut(simplified) if tag in kept_tags]


def predict(email):
    """Classify one raw email and print the predicted label.

    Loads the pickled vocabulary and the saved model from the ``data/``
    directory on every call, cleans the email with ``clean_data``, vectorizes
    it as a single document and runs the model on it.

    Args:
        email: Raw email text as a string.

    Returns:
        The predicted label for the email (the single element of the model's
        prediction output). The label is also printed.
    """
    # Feature extractor: rebuild the vectorizer from the saved vocabulary.
    # The with-block closes the file handle, which the bare open() leaked.
    # NOTE(review): pickle/joblib loading can execute arbitrary code; these
    # artifact files must come from a trusted source.
    with open('data/03_模型训练特征.pkl', 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    transfer = CountVectorizer(max_features=10000, vocabulary=vocab)

    # Load the trained model.
    model = joblib.load('data/04_垃圾邮件分类模型.pth')

    # Clean the text: clean_data returns a list of words.
    words = clean_data(email)

    # CountVectorizer.transform treats each element of its input as a separate
    # document. Passing the word list directly made every word its own
    # "document", so the printed result was the prediction for the first word
    # only (and an IndexError when no word survived cleaning). Join the words
    # into one space-separated document and transform a one-element list.
    # NOTE(review): assumes training vectorized space-joined tokens — confirm
    # against the training script.
    document = ' '.join(words)
    email_features = transfer.transform([document])

    # Predict: exactly one row in, so exactly one label out.
    email_pre = model.predict(email_features)
    label = email_pre[0]

    print(label)
    return label

if __name__ == '__main__':
    # Script entry point: run predict() on three hard-coded sample texts.

    # Sample 1: raw message with mail headers followed by a short Chinese body.
    email1 = '''
Received: from coozo.com ([219.133.254.230])
by spam-gw.ccert.edu.cn (MIMEDefang) with ESMTP id j8L2Zoqi028766
for <li@ccert.edu.cn>; Fri, 23 Sep 2005 13:01:45 +0800 (CST)
Message-ID: <200509211035.j8L2Zoqi028766@spam-gw.ccert.edu.cn>
From: "you" <you@coozo.com>
Subject: =?gb2312?B?us/X9w==?=
To: li@ccert.edu.cn
Content-Type: text/plain;charset="GB2312"
Content-Transfer-Encoding: 8bit
Date: Sun, 23 Oct 2005 23:44:32 +0800
X-Priority: 3
X-Mailer: Microsoft Outlook Express 6.00.2800.1106
您好！
我公司有多余的发票可以向外代开！（国税、地税、运输、广告、海关缴款书）。
如果贵公司（厂）有需要请来电洽谈、咨询！
联系电话: 013510251389 陈先生
谢谢
顺祝商祺!
'''
    # Sample 2: raw message with mail headers followed by a longer Chinese body.
    # NOTE(review): the last two lines of this literal ('程序输出结果:' and
    # '--') look like text pasted from elsewhere rather than email content —
    # confirm; they are part of the string passed to predict().
    email2 = '''
Received: from web15010.mail.cnb.yahoo.com (web15010.mail.cnb.yahoo.com
[202.165.103.67])
by spam-gw.ccert.edu.cn (MIMEDefang) with ESMTP id j8R8H2V8018468
for <hu@ccert.edu.cn>; Thu, 29 Sep 2005 19:39:41 +0800 (CST)
Received: (qmail 54688 invoked by uid 60001); Thu, 29 Sep 2005 11:50:48 -0000
DomainKey-Signature: a=rsa-sha1; q=dns; c=nofws;
s=s1024; d=yahoo.com.cn;
h=Message-ID:Received:Date:From:Subject:To:MIME-Version:Content-Type:Content￾Transfer-Encoding;
b=bSU/zJOkkJfDLFBbWnWnTUKDZWedZej7CHwk+68TMJOxc5bWNOV3oFm+Sdj7+BguqbdY8hBnj9by0v
LAREwvNsRCI/vWqZokpQhqNS620fenBohJKxF1JDhRipTl6dha0/sPi1Z9L+cjbm98QQkoNFkiZSBiuB
y63tmjYznR3JE= ;
Message-ID: <20050927082809.54686.qmail@web15010.mail.cnb.yahoo.com>
Received: from [61.150.43.113] by web15010.mail.cnb.yahoo.com via HTTP; Thu, 29
Sep 2005 19:50:48 CST
Date: Thu, 29 Sep 2005 19:50:48 +0800 (CST)
From: liang ming <yang@yahoo.com.cn>
Subject: =?gb2312?B?UmU6ILOzvNzKscTQxfPT0cDPt62z9tLUx7C1xMrCx+k=?=
To: hu@ccert.edu.cn
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="0-1710224003-1127809689=:53686"
Content-Transfer-Encoding: 8bit
我怎么觉得是你在翻..
标 题: 吵架时男朋友老翻出以前的事情
我觉得吵完了和好了就过去了，他却总是在下一次吵架的时候提起。是不是心胸不够宽
阔？老这样下去伤心死了。经常是吵完了我哭他不理我，后来太晚了他就搂着我拍拍我然
后天亮了我们都要去上班。昨天他说他想要的太多了，得到的太少了。我说我从来不觉得
我付出的少，他就质问我付出了什么。我为了他离开了以前的男朋友，办好了去日本的签
证而没去，离开了大连在这里辛苦的生活。远离了一些朋友，工资没怎么涨，每天忍受着
一个半小时的公交车，饭费房租都是2倍而房子却不是精装修也没有家电，忍受着电梯和
楼下汽车的噪音。听他那么说真伤心，觉得自己的爱在消减，好担心会不爱了。
程序输出结果:
--
'''
    # Sample 3: a single short sentence with no headers.
    email3 = '''你真是一个好人啊，我司诚挚邀您入职'''

    # Classify the samples in order; predict() prints each result.
    for sample in (email1, email2, email3):
        predict(sample)